# Input CSVs under data/processed/childes, located with here().
mappings_in_path <- here("data/processed/childes/all_types_norm_mappings.csv")
tokens_in_path <- here("data/processed/childes/all_tokens_post-norm.csv")

# Load both tables; column types are guessed by read_csv().
mappings <- read_csv(file = mappings_in_path)
tokens_raw <- read_csv(file = tokens_in_path)

Grouping by corpus AND child

# Add a single corpus_child key ("<corpus>_<child>") to every token row.
# paste() is vectorised, so no grouping is needed to build the key; the
# result is the same ungrouped table the grouped version produced.
by_kid1 <- tokens_raw %>%
  mutate(corpus_child = paste(corpus, child, sep = "_"))

# One row per (corpus_child, word) with n = number of tokens of that word
# produced for that corpus_child. Rows keep their first-appearance order.
by_kid2 <- by_kid1 %>%
  dplyr::select(corpus_child, word) %>%
  add_count(corpus_child, word) %>%
  distinct(corpus_child, word, .keep_all = TRUE)


# Relative word frequency per corpus_child.
#   n         = count of the word for this corpus_child
#   N         = total token count for this corpus_child (sum of n)
#   freq      = n / N
#   trns_freq = log10(freq + 1)
# N is computed with an explicit grouped sum rather than add_tally(n), which
# would store the total under the auto-generated name `nn` (because `n`
# already exists) and require a follow-up rename.
by_kid3 <- by_kid2 %>%
  group_by(corpus_child) %>%
  mutate(N = sum(n)) %>%
  ungroup() %>%
  mutate(freq = n / N,
         trns_freq = log10(freq + 1))

# Word x corpus_child matrix of transformed frequencies; a word that never
# occurs for a given corpus_child gets 0.
td_matrix_bykid <- by_kid3 %>%
  dplyr::select(word, corpus_child, trns_freq) %>%
  spread(key = corpus_child, value = trns_freq, fill = 0)


# Correlation between every pair of corpus_child columns (word column dropped).
M_bykid <- cor(dplyr::select(td_matrix_bykid, -word))

# Display only the first 10 x 10 corner of the matrix, rounded to 2 decimals.
corrplot(
  round(M_bykid[1:10, 1:10], digits = 2),
  method = "number",
  tl.srt = 45
)

# Two-dimensional MDS (isoMDS) on the dissimilarity 1 - correlation between
# corpus_child columns.
nm_mds_bykid <- isoMDS(d = 1 - M_bykid, k = 2)
## initial  value 29.894866 
## iter   5 value 19.726697
## iter  10 value 16.273466
## iter  15 value 15.962877
## iter  20 value 15.806430
## iter  20 value 15.790962
## iter  20 value 15.779068
## final  value 15.779068 
## converged
# Turn the MDS point matrix into a data frame with x/y columns, move its row
# names into corpus_child, and split that key back into corpus and child.
# extra = "merge" splits only at the first underscore, so a child name that
# itself contains "_" is kept whole instead of being truncated with a warning.
coords_bykid <- nm_mds_bykid$points %>%
  as.data.frame() %>%
  rename(x = V1, y = V2) %>%
  rownames_to_column(var = "corpus_child") %>%
  separate(corpus_child, c("corpus", "child"), sep = "_", remove = FALSE,
           extra = "merge")


# Shorten the two long corpus names used as plot labels; every other corpus
# name is left unchanged.
coords_bykid <- coords_bykid %>%
  mutate(corpus = case_when(
    corpus == "MacWhinney" ~ "McW",
    corpus == "EllisWeismer" ~ "EW",
    TRUE ~ corpus
  ))

# One text label per corpus_child at its MDS coordinates, coloured by corpus.
# The colour legend is suppressed with "none"; passing FALSE to guides() is
# deprecated in ggplot2.
ggplot(coords_bykid, aes(x, y, label = corpus, color = corpus)) +
  geom_text() +
  theme_minimal() +
  guides(color = "none")

# Same plot as above, but the listed corpus_child keys are relabelled
# "REMOVE" so they stand out from the rest of the points.
# The colour legend is suppressed with "none"; passing FALSE to guides() is
# deprecated in ggplot2.
coords_bykid %>%
  mutate(corpus = ifelse(corpus_child %in% c("Cornell_Felicia", "Cornell_Rhonda", "Warren_Jmarkey",
                                           "Cornell_Sarah", "Warren_Gina", "Warren_Mary",
                                           "Warren_David", "Warren_Louise"), "REMOVE", corpus)) %>%
  ggplot(aes(x, y, label = corpus, color = corpus)) +
  geom_text() +
  theme_minimal() +
  guides(color = "none")

# ggplot(coords_bykid, aes(x, y, label = corpus, color = corpus)) + 
#   geom_text(size = 3.5) +
#   xlim(-0.75, 1) +
#   ylim(-0.75, 1) + 
#   theme_minimal() +
#   guides(color = FALSE) +
#   labs(title = "ZOOMED IN")

Grouping by corpus only - no children removed

# One row per (corpus, word) with n = number of tokens of that word in the
# corpus. Rows keep their first-appearance order.
by_corpus1 <- tokens_raw %>%
  dplyr::select(corpus, word) %>%
  add_count(corpus, word) %>%
  distinct(corpus, word, .keep_all = TRUE)


# Relative word frequency per corpus.
#   n         = count of the word in this corpus
#   N         = total token count of the corpus (sum of n)
#   freq      = n / N
#   trns_freq = log10(freq + 1)
# N is computed with an explicit grouped sum rather than add_tally(n), which
# would store the total under the auto-generated name `nn` (because `n`
# already exists) and require a follow-up rename.
by_corpus2 <- by_corpus1 %>%
  group_by(corpus) %>%
  mutate(N = sum(n)) %>%
  ungroup() %>%
  mutate(freq = n / N,
         trns_freq = log10(freq + 1))

# Word x corpus matrix of transformed frequencies; a word absent from a
# corpus gets 0.
td_matrix_bycorpus <- by_corpus2 %>%
  dplyr::select(word, corpus, trns_freq) %>%
  spread(key = corpus, value = trns_freq, fill = 0)


# Correlation between every pair of corpus columns (word column dropped).
M_bycorpus <- cor(dplyr::select(td_matrix_bycorpus, -word))

# Full correlation matrix, rounded to 2 decimals and drawn as numbers.
corrplot(
  round(M_bycorpus, digits = 2),
  method = "number",
  tl.srt = 45
)

# Two-dimensional MDS (isoMDS) on the dissimilarity 1 - correlation between
# corpora.
nm_mds_bycorpus <- isoMDS(d = 1 - M_bycorpus, k = 2)
## initial  value 18.171402 
## iter   5 value 14.421080
## iter  10 value 13.116299
## iter  15 value 12.256268
## iter  15 value 12.247430
## iter  15 value 12.247430
## final  value 12.247430 
## converged
# Data frame of MDS coordinates: the row names of the point matrix become the
# corpus column, and the two coordinate columns are named x and y.
coords_bycorpus <- as.data.frame(nm_mds_bycorpus$points) %>%
  rownames_to_column(var = "corpus") %>%
  rename(x = V1, y = V2)
# One point per corpus at its MDS coordinates, with a repelled text label.
# The colour legend is suppressed with "none"; passing FALSE to guides() is
# deprecated in ggplot2.
ggplot(coords_bycorpus, aes(x, y, label = corpus, color = corpus)) +
  geom_point(alpha = 0.7) +
  geom_text_repel() +
  theme_minimal() +
  guides(color = "none")

Grouping by corpus only - outlier children removed

# corpus_child keys ("<corpus>_<child>") of the children to drop before the
# corpus-level counts are recomputed.
outliers <- c(
  "Cornell_Felicia", "Cornell_Rhonda", "Warren_Jmarkey",
  "Cornell_Sarah", "Warren_Gina", "Warren_Mary",
  "Warren_David", "Warren_Louise"
)


# Drop every token whose "<corpus>_<child>" key is listed in `outliers`, then
# keep one row per (corpus, word) with n = number of remaining tokens of that
# word in the corpus. Rows keep their first-appearance order.
removed1 <- tokens_raw %>%
  filter(!(paste(corpus, child, sep = "_") %in% outliers)) %>%
  dplyr::select(corpus, word) %>%
  add_count(corpus, word) %>%
  distinct(corpus, word, .keep_all = TRUE)


# Relative word frequency per corpus, computed from `removed1`.
#   n         = count of the word in this corpus
#   N         = total token count of the corpus (sum of n)
#   freq      = n / N
#   trns_freq = log10(freq + 1)
# N is computed with an explicit grouped sum rather than add_tally(n), which
# would store the total under the auto-generated name `nn` (because `n`
# already exists) and require a follow-up rename.
removed2 <- removed1 %>%
  group_by(corpus) %>%
  mutate(N = sum(n)) %>%
  ungroup() %>%
  mutate(freq = n / N,
         trns_freq = log10(freq + 1))

# Word x corpus matrix of transformed frequencies; a word absent from a
# corpus gets 0.
td_matrix_removed <- removed2 %>%
  dplyr::select(word, corpus, trns_freq) %>%
  spread(key = corpus, value = trns_freq, fill = 0)


# Correlation between every pair of corpus columns (word column dropped).
M_removed <- cor(dplyr::select(td_matrix_removed, -word))

# Full correlation matrix, rounded to 2 decimals and drawn as numbers.
corrplot(
  round(M_removed, digits = 2),
  method = "number",
  tl.srt = 45
)

# Two-dimensional MDS (isoMDS) on the dissimilarity 1 - correlation between
# corpora, using the matrix built from `td_matrix_removed`.
nm_mds_removed <- isoMDS(d = 1 - M_removed, k = 2)
## initial  value 20.851396 
## iter   5 value 17.361216
## iter  10 value 13.896642
## iter  15 value 13.574920
## iter  20 value 12.554707
## final  value 12.386920 
## converged
# Data frame of MDS coordinates: the row names of the point matrix become the
# corpus column, and the two coordinate columns are named x and y.
coords_removed <- as.data.frame(nm_mds_removed$points) %>%
  rownames_to_column(var = "corpus") %>%
  rename(x = V1, y = V2)
# One point per corpus at its MDS coordinates, with a repelled text label.
# The colour legend is suppressed with "none"; passing FALSE to guides() is
# deprecated in ggplot2.
ggplot(coords_removed, aes(x, y, label = corpus, color = corpus)) +
  geom_point(alpha = 0.7) +
  geom_text_repel() +
  theme_minimal() +
  guides(color = "none")

Visualizations

corpus

corpus + child

corpus + child outliers removed